In [108]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn. model_selection import train_test_split
from sklearn. model_selection import RepeatedStratifiedKFold
from sklearn. model_selection import cross_val_score
from sklearn. discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

random.seed(42)

sns.set_theme(style="ticks")
In [109]:
df = pd.read_csv('winequality-red2.csv', delimiter=',')
df = df.assign(quality = df['quality'] >= 6)
print(df.head())
print(df.shape)
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4    False  
1      9.8    False  
2      9.8    False  
3      9.8     True  
4      9.4    False  
(1599, 12)
In [110]:
df1 = df[['fixed acidity', 'volatile acidity', 'residual sugar', 'chlorides', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']]
In [111]:
df1_train, df1_test = train_test_split(df1, test_size=0.2)
In [112]:
plt.show(sns.pairplot(df1_train.iloc[:, :-1]))
In [113]:
plt.show(sns.pairplot(df1_train.iloc[:, :-1], kind="kde"))
In [114]:
df1_train.insert(1, 'fixed_acidity_log', np.log(df1_train['fixed acidity']))
df1_train.insert(1, 'residual_sugar_log', np.log(df1_train['residual sugar']))
df1_train.insert(1, 'volatile_acidity', df1_train['volatile acidity'])
df1_train.insert(1, 'chlorides_log', np.log(df1_train['chlorides']))
df1_train.insert(1, 'total_sulfur_dioxide_log', np.log(df1_train['total sulfur dioxide']))
df1_train.insert(1, 'pH_log', np.log(df1_train['pH']))
df1_train.insert(1, 'sulphates_log', np.log(df1_train['sulphates']))
df1_train.insert(1, 'alcohol_log', np.log(df1_train['alcohol']))
In [115]:
df1_train = df1_train.drop(['fixed acidity', 'volatile acidity', 'residual sugar', 'chlorides', 'total sulfur dioxide', 'pH', 'sulphates', 'alcohol'], axis = 1)
In [116]:
plt.show(sns.pairplot(df1_train, hue = 'quality', diag_kind='hist'))
In [117]:
df1_train = df1_train.drop(df1_train[df1_train.chlorides_log < -4].index)
df1_train = df1_train.drop(df1_train[df1_train.total_sulfur_dioxide_log > 5.1].index)
df1_train = df1_train.drop(df1_train[df1_train.alcohol_log > 2.7].index)
In [118]:
plt.show(sns.pairplot(df1_train, hue = 'quality', diag_kind='hist'))
In [119]:
df1_test.insert(1, 'fixed_acidity_log', np.log(df1_test['fixed acidity']))
df1_test.insert(1, 'residual_sugar_log', np.log(df1_test['residual sugar']))
df1_test.insert(1, 'volatile_acidity', df1_test['volatile acidity'])
df1_test.insert(1, 'chlorides_log', np.log(df1_test['chlorides']))
df1_test.insert(1, 'total_sulfur_dioxide_log', np.log(df1_test['total sulfur dioxide']))
df1_test.insert(1, 'pH_log', np.log(df1_test['pH']))
df1_test.insert(1, 'sulphates_log', np.log(df1_test['sulphates']))
df1_test.insert(1, 'alcohol_log', np.log(df1_test['alcohol']))
df1_test = df1_test.drop(['fixed acidity', 'volatile acidity','residual sugar', 'chlorides', 'total sulfur dioxide', 'pH', 'sulphates', 'alcohol'], axis = 1)
In [120]:
X_train = df1_train.iloc[:, :-1].to_numpy()
X_test = df1_test.iloc[:, :-1].to_numpy()
y_train = df1_train['quality'].astype(int).to_numpy()
y_test = df1_test['quality'].astype(int).to_numpy()
In [123]:
mlp = MLPClassifier(hidden_layer_sizes=(1000,), activation='relu',
                    solver='adam',
                    alpha=1e-4, verbose=False, learning_rate_init=.0001, max_iter=1, warm_start=True, early_stopping = False)
mlp_score1 = np.array([])
mlp_score2 = np.array([])
X_train1, X_train2, y_train1, y_train2 = train_test_split(X_train, y_train, train_size=0.75, stratify=y_train)
In [124]:
for i in range(1000):
    mlp.fit(X_train1, y_train1)
    mlp_score1 = np.append(mlp_score1, mlp.score(X_train1, y_train1))
    mlp_score2 = np.append(mlp_score2, mlp.score(X_train2, y_train2))
plt.figure()
plt.plot(1 - mlp_score1)
plt.plot(1 - mlp_score2)
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','val'], loc = 'upper right')
plt.show()    
/opt/anaconda3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:702: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1) reached and the optimization hasn't converged yet.
  warnings.warn(
In [125]:
GRID = [
    {'estimator': [MLPClassifier()],
     'estimator__solver': ['adam', 'sgd'],
     'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
     'estimator__max_iter': [500],
     'estimator__hidden_layer_sizes': [(300,), (250,), (200,), (150,)],
     'estimator__activation': ['logistic', 'relu', 'tanh'],
     'estimator__alpha': [1e-4, 1e-3],
     'estimator__early_stopping': [True, False]
     }
]

PIPELINE = Pipeline([('scaler', None), ('estimator', MLPClassifier())])
In [126]:
grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID, 
                            scoring=make_scorer(accuracy_score),# average='macro'), 
                            n_jobs=-1, cv=5, refit=True, verbose=1, 
                            return_train_score=True)
In [127]:
grid_search.fit(X_train, y_train)
Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Out[127]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', None),
                                       ('estimator', MLPClassifier())]),
             n_jobs=-1,
             param_grid=[{'estimator': [MLPClassifier(hidden_layer_sizes=(300,),
                                                      max_iter=500)],
                          'estimator__activation': ['logistic', 'relu', 'tanh'],
                          'estimator__alpha': [0.0001, 0.001],
                          'estimator__early_stopping': [True, False],
                          'estimator__hidden_layer_sizes': [(300,), (250,),
                                                            (200,), (150,)],
                          'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
                          'estimator__max_iter': [500],
                          'estimator__solver': ['adam', 'sgd']}],
             return_train_score=True, scoring=make_scorer(accuracy_score),
             verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', None),
                                       ('estimator', MLPClassifier())]),
             n_jobs=-1,
             param_grid=[{'estimator': [MLPClassifier(hidden_layer_sizes=(300,),
                                                      max_iter=500)],
                          'estimator__activation': ['logistic', 'relu', 'tanh'],
                          'estimator__alpha': [0.0001, 0.001],
                          'estimator__early_stopping': [True, False],
                          'estimator__hidden_layer_sizes': [(300,), (250,),
                                                            (200,), (150,)],
                          'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
                          'estimator__max_iter': [500],
                          'estimator__solver': ['adam', 'sgd']}],
             return_train_score=True, scoring=make_scorer(accuracy_score),
             verbose=1)
Pipeline(steps=[('scaler', None), ('estimator', MLPClassifier())])
None
MLPClassifier()
In [128]:
print(grid_search.score(X_train, y_train))
print(grid_search.best_params_)
0.7605965463108321
{'estimator': MLPClassifier(hidden_layer_sizes=(300,), max_iter=500), 'estimator__activation': 'relu', 'estimator__alpha': 0.0001, 'estimator__early_stopping': False, 'estimator__hidden_layer_sizes': (300,), 'estimator__learning_rate_init': 0.001, 'estimator__max_iter': 500, 'estimator__solver': 'adam'}
In [129]:
print("Test set score: %f" % grid_search.score(X_test, y_test))
Test set score: 0.715625
In [130]:
dff = pd.concat([pd.DataFrame(X_train), pd.DataFrame(X_test)])
dff1 = pd.concat([pd.DataFrame(y_train), pd.DataFrame(y_test)])
normalizedStd_df = (dff-dff.mean())/dff.std()
normalizedStd_df['quality'] = dff1
normalizedMinMax_df=(dff-dff.min())/(dff.max()-dff.min())
normalizedMinMax_df['quality'] = dff1
df_normalizedStd_train, df_normalizedStd_test = train_test_split(normalizedStd_df, test_size=0.2)
df_normalizedMinMax_train, df_normalizedMinMax_test = train_test_split(normalizedMinMax_df, test_size=0.2)
In [131]:
X_normalizedStd_train = df_normalizedStd_train.iloc[:, :-1].to_numpy()
X_normalizedStd_test = df_normalizedStd_test.iloc[:, :-1].to_numpy()
y_normalizedStd_train = df_normalizedStd_train['quality'].astype(int).to_numpy()
y_normalizedStd_test = df_normalizedStd_test['quality'].astype(int).to_numpy()
In [133]:
grid_search.fit(X_normalizedStd_train, y_normalizedStd_train)
Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Out[133]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', None),
                                       ('estimator', MLPClassifier())]),
             n_jobs=-1,
             param_grid=[{'estimator': [MLPClassifier(alpha=0.001,
                                                      early_stopping=True,
                                                      hidden_layer_sizes=(200,),
                                                      learning_rate_init=0.1,
                                                      max_iter=500,
                                                      solver='sgd')],
                          'estimator__activation': ['logistic', 'relu', 'tanh'],
                          'estimator__alpha': [0.0001, 0.001],
                          'estimator__early_stopping': [True, False],
                          'estimator__hidden_layer_sizes': [(300,), (250,),
                                                            (200,), (150,)],
                          'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
                          'estimator__max_iter': [500],
                          'estimator__solver': ['adam', 'sgd']}],
             return_train_score=True, scoring=make_scorer(accuracy_score),
             verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', None),
                                       ('estimator', MLPClassifier())]),
             n_jobs=-1,
             param_grid=[{'estimator': [MLPClassifier(alpha=0.001,
                                                      early_stopping=True,
                                                      hidden_layer_sizes=(200,),
                                                      learning_rate_init=0.1,
                                                      max_iter=500,
                                                      solver='sgd')],
                          'estimator__activation': ['logistic', 'relu', 'tanh'],
                          'estimator__alpha': [0.0001, 0.001],
                          'estimator__early_stopping': [True, False],
                          'estimator__hidden_layer_sizes': [(300,), (250,),
                                                            (200,), (150,)],
                          'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
                          'estimator__max_iter': [500],
                          'estimator__solver': ['adam', 'sgd']}],
             return_train_score=True, scoring=make_scorer(accuracy_score),
             verbose=1)
Pipeline(steps=[('scaler', None), ('estimator', MLPClassifier())])
None
MLPClassifier()
In [135]:
print(grid_search.score(X_normalizedStd_train, y_normalizedStd_train))
print(grid_search.best_params_)
0.7733333333333333
{'estimator': MLPClassifier(alpha=0.001, early_stopping=True, hidden_layer_sizes=(200,),
              learning_rate_init=0.1, max_iter=500, solver='sgd'), 'estimator__activation': 'relu', 'estimator__alpha': 0.001, 'estimator__early_stopping': True, 'estimator__hidden_layer_sizes': (200,), 'estimator__learning_rate_init': 0.1, 'estimator__max_iter': 500, 'estimator__solver': 'sgd'}
In [136]:
print("Test set score: %f" % grid_search.score(X_normalizedStd_test, y_normalizedStd_test))
Test set score: 0.764890
In [139]:
X_normalizedMinMax_train = df_normalizedMinMax_train.iloc[:, :-1].toM_numpy()
X_normalizedMinMax_test = df_normalizedMinMax_test.iloc[:, :-1].to_numpy()
y_normalizedMinMax_train = df_normalizedMinMax_train['quality'].astype(int).to_numpy()
y_normalizedMinMax_test = df_normalizedMinMax_test['quality'].astype(int).to_numpy()
In [140]:
grid_search.fit(X_normalizedMinMax_train, y_normalizedMinMax_train)
Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Out[140]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', None),
                                       ('estimator', MLPClassifier())]),
             n_jobs=-1,
             param_grid=[{'estimator': [MLPClassifier(alpha=0.001,
                                                      hidden_layer_sizes=(200,),
                                                      learning_rate_init=0.1,
                                                      max_iter=500,
                                                      solver='sgd')],
                          'estimator__activation': ['logistic', 'relu', 'tanh'],
                          'estimator__alpha': [0.0001, 0.001],
                          'estimator__early_stopping': [True, False],
                          'estimator__hidden_layer_sizes': [(300,), (250,),
                                                            (200,), (150,)],
                          'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
                          'estimator__max_iter': [500],
                          'estimator__solver': ['adam', 'sgd']}],
             return_train_score=True, scoring=make_scorer(accuracy_score),
             verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', None),
                                       ('estimator', MLPClassifier())]),
             n_jobs=-1,
             param_grid=[{'estimator': [MLPClassifier(alpha=0.001,
                                                      hidden_layer_sizes=(200,),
                                                      learning_rate_init=0.1,
                                                      max_iter=500,
                                                      solver='sgd')],
                          'estimator__activation': ['logistic', 'relu', 'tanh'],
                          'estimator__alpha': [0.0001, 0.001],
                          'estimator__early_stopping': [True, False],
                          'estimator__hidden_layer_sizes': [(300,), (250,),
                                                            (200,), (150,)],
                          'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
                          'estimator__max_iter': [500],
                          'estimator__solver': ['adam', 'sgd']}],
             return_train_score=True, scoring=make_scorer(accuracy_score),
             verbose=1)
Pipeline(steps=[('scaler', None), ('estimator', MLPClassifier())])
None
MLPClassifier()
In [141]:
print(grid_search.score(X_normalizedMinMax_train, y_normalizedMinMax_train))
print(grid_search.best_params_)
0.7631372549019608
{'estimator': MLPClassifier(alpha=0.001, hidden_layer_sizes=(200,), learning_rate_init=0.1,
              max_iter=500, solver='sgd'), 'estimator__activation': 'relu', 'estimator__alpha': 0.001, 'estimator__early_stopping': False, 'estimator__hidden_layer_sizes': (200,), 'estimator__learning_rate_init': 0.1, 'estimator__max_iter': 500, 'estimator__solver': 'sgd'}
In [142]:
print("Test set score: %f" % grid_search.score(X_normalizedMinMax_test, y_normalizedMinMax_test))
Test set score: 0.733542